"""Data loading utilities for the fractal‑pivot calibration pipeline.

This module provides a unified interface for loading point data from a
variety of formats.  The goal is to return a NumPy array of purely
numeric values regardless of whether the source file lives in a CSV,
NumPy binary or inside a ZIP archive.  Any non‑numeric columns are
discarded to ensure downstream algorithms operate only on floating
point coordinates.

Supported file types
---------------------
* ``.csv`` – comma separated values with a header row.  Only numeric
  columns are retained.
* ``.npy`` – a NumPy array saved with ``numpy.save``.  If the array is
  one‑dimensional it will be reshaped to ``(n, 1)``.  Non‑numeric
  dtypes are cast to floats if possible.
* ``.zip`` – a ZIP archive containing one or more data files.  The
  loader searches for the first file ending in ``.csv`` or ``.npy``
  inside the archive and loads that.  This behaviour is sufficient
  for the synthetic LIDAR and DEM data generated as part of this
  exercise.

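If the file extension is not recognised a ``ValueError`` will be
raised.  Paths are passed to ``pathlib.Path`` unchanged, so relative
paths resolve against the current working directory (normally the
project root when the pipeline is run from there).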

Example
-------
>>> import data_loader
>>> points = data_loader.load_dataset('data/barnsley_fern.csv')
>>> points.shape
(50000, 2)
"""

from __future__ import annotations

import io
import zipfile
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd

def _load_csv(path: Path) -> np.ndarray:
    """Load a CSV file and return only numeric columns.

    Parameters
    ----------
    path : Path
        Path to the CSV file.

    Returns
    -------
    np.ndarray
        An array of shape ``(n_samples, n_features)`` containing only
        numeric values.
    """
    df = pd.read_csv(path)
    # Select numeric columns; ignore any categorical/object columns
    numeric_df = df.select_dtypes(include=[np.number])
    if numeric_df.empty:
        raise ValueError(f"No numeric columns found in {path}")
    return numeric_df.to_numpy()

def _load_npy(path: Path) -> np.ndarray:
    """Load a NumPy binary file and ensure the result is 2D.

    Parameters
    ----------
    path : Path
        Path to the ``.npy`` file.

    Returns
    -------
    np.ndarray
        Two‑dimensional numeric array.
    """
    arr = np.load(path, allow_pickle=False)
    if arr.ndim == 1:
        arr = arr.reshape(-1, 1)
    elif arr.ndim > 2:
        # Flatten any higher dimensions into a single feature axis
        arr = arr.reshape(arr.shape[0], -1)
    return arr.astype(float)

def _load_from_zip(path: Path) -> np.ndarray:
    """Extract the first CSV or NPY file from a ZIP archive and load it.

    The loader iterates over the members of the archive and picks the
    first entry whose name ends with ``.csv`` or ``.npy`` (case
    insensitive).  Other files are ignored.  If no appropriate file is
    found a ``ValueError`` is raised.

    Parameters
    ----------
    path : Path
        Path to the ZIP archive.

    Returns
    -------
    np.ndarray
        Numeric array extracted from the archive.
    """
    with zipfile.ZipFile(path, 'r') as zf:
        for name in zf.namelist():
            lower = name.lower()
            if lower.endswith('.csv'):
                with zf.open(name) as f:
                    # Read into a pandas DataFrame for type selection
                    df = pd.read_csv(f)
                    numeric_df = df.select_dtypes(include=[np.number])
                    if numeric_df.empty:
                        continue
                    return numeric_df.to_numpy()
            elif lower.endswith('.npy'):
                with zf.open(name) as f:
                    data = f.read()
                    arr = np.load(io.BytesIO(data), allow_pickle=False)
                    if arr.ndim == 1:
                        arr = arr.reshape(-1, 1)
                    elif arr.ndim > 2:
                        arr = arr.reshape(arr.shape[0], -1)
                    return arr.astype(float)
    raise ValueError(f"No supported data file (.csv/.npy) found inside {path}")

def load_dataset(path: str | Path) -> np.ndarray:
    """Read a dataset file and return its numeric contents as an array.

    The loader is chosen from the lower-cased file extension:
    ``.csv``, ``.npy`` and ``.zip`` are handled by the corresponding
    private loader of this module.

    Parameters
    ----------
    path : str or Path
        Location of the dataset file.

    Returns
    -------
    np.ndarray
        Whatever the selected loader returns; one row per sample and
        one column per numeric feature.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    ValueError
        If the file extension is not one of the supported types.
    """
    dataset_path = Path(path)
    if not dataset_path.exists():
        raise FileNotFoundError(f"Dataset file not found: {dataset_path}")

    # Compare case-insensitively so ``.CSV`` and ``.csv`` are equivalent.
    extension = dataset_path.suffix.lower()
    if extension == '.csv':
        return _load_csv(dataset_path)
    if extension == '.npy':
        return _load_npy(dataset_path)
    if extension == '.zip':
        return _load_from_zip(dataset_path)
    raise ValueError(f"Unsupported dataset type: {extension} (file {dataset_path})")